Introduction
Earlier this year, BBC Music asked more than 100 critics, artists, and other music industry experts for their five favorite hip-hop tracks. The complete poll results together with information about the voters are available at the #tidytuesday Github repository.
Let us explore the data sets, see which are the most rated songs, and explore their audio features such as loudness, danceability and instrumentalness. We start by loading the packages and some functions required for the analysis.
library(tidyverse)
require(maps)
library(here)
library(pdftools)
library(scales)
library(treemapify)
library(spotifyr)
library(ggimage)
library(ggcorrplot)
library(tidytext)
library(nFactors)
library(psych)
library(cowplot)
library(ggrepel)
library(RColorBrewer)
# Define a custom theme for this project
library(showtext)
font_add_google("Montaga", "Montaga")
showtext_auto()
# trace(grDevices::png, exit = quote({
# showtext::showtext_begin()
# }), print = FALSE)
# untrace(grDevices::png)
mygray <- "#F8F7FF"
cols <- c("#404664", "#726CC6", "#AAA7DD", "#D3D3EE", "#FBE8DA")
theme_set(theme_light())
theme_update(text = element_text(color = "black", family = "Montaga"),
panel.grid = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(fill = mygray),
plot.background = element_rect(fill = mygray),
strip.background = element_rect(fill = mygray),
plot.title = element_text(size = 30),
plot.subtitle = element_text(size = 18),
plot.caption = element_text(size = 13),
axis.text = element_text(size = 18),
axis.title = element_text(size = 22),
axis.ticks = element_blank(),
legend.position = "bottom",
legend.title = element_text(size = 20),
legend.text = element_text(size=15),
legend.background = element_blank(),
legend.box.background = element_blank(),
strip.text = element_text(size = 17),
strip.text.x = element_text(colour = "black"))
show_table <- function(x, caption = "", head = 50, scroll = FALSE, full.width = FALSE,
digits = 2, col.names = NA, align = NULL){
table <- x %>%
head(head) %>%
kable(caption = caption, digits = digits, col.names = col.names, align = align,
format.args = list(decimal.mark = ".", big.mark = "")) %>%
kable_styling("striped", position = "left", full_width = full.width)
if(scroll){
table <- table %>%
scroll_box(width = "100%", height = "500px")
}
return(table)
}
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
frequency_table <- function(df, group_var, align = NULL, prop = TRUE,
head = nrow(df), caption = ""){
group_var <- enquo(group_var)
col.names <- c(firstup(as_label(group_var)), "Frequency")
table <- df %>%
group_by(!! group_var) %>%
summarize(n = n()) %>%
arrange(desc(n))
if(prop){
col.names <- c(col.names, "Proportion")
table <- table %>%
mutate(prop = n / sum(n),
prop = percent(prop))
}
table %>%
show_table(col.names = col.names, align = align, head = head, caption = caption)
}The available data sets are polls.csv and rankings.csv. Let us have a look at these data sets.
polls <- read_delim('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/polls.csv', delim = ",") %>%
select(rank : critic_country) %>%
mutate_if(is.double, as.integer) %>%
mutate(gender = as.factor(gender))
rankings <- read_delim('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/rankings.csv', delim = ",") %>%
mutate_if(is.double, as.integer)Polls
The polls data set gathers information about the voters (that is, name, occupation, and country of origin), their five songs from most to least favorite, the singers’ names and gender, and the release years of the songs.
polls %>%
show_table(head = 20, align = c(rep("l", 3), rep("r", 5)), scroll = TRUE,
caption = "Polls data set (first 20 rows)")| rank | title | artist | gender | year | critic_name | critic_rols | critic_country |
|---|---|---|---|---|---|---|---|
| 1 | Terminator X To The Edge of Panic | Public Enemy | male | 1998 | Joseph Abajian | Fat Beats | US |
| 2 | 4th Chamber | Gza ft. Ghostface Killah & Killah Priest & RZA | male | 1995 | Joseph Abajian | Fat Beats | US |
| 3 | Peter Piper | Run DMC | male | 1986 | Joseph Abajian | Fat Beats | US |
| 4 | Play That Beat Mr DJ | GLOBE & Whiz Kid | male | 2001 | Joseph Abajian | Fat Beats | US |
| 5 | Time’s Up | O.C. | male | 1994 | Joseph Abajian | Fat Beats | US |
| 1 | Players | Slum Village | male | 1997 | Biba Adams | Critic | US |
| 2 | Self Destruction | Stop The Violence Movement | mixed | 1989 | Biba Adams | Critic | US |
| 3 | Push It | Salt-N-Pepa | female | 1986 | Biba Adams | Critic | US |
| 4 | Ambitionz Az A Ridah | 2Pac | male | 1996 | Biba Adams | Critic | US |
| 5 | Big Pimpin’ | JAY-Z ft. UGK | male | 1999 | Biba Adams | Critic | US |
| 1 | Rapper’s Delight | Sugarhill Gang | male | 1979 | Dart Adams | Critic | US |
| 2 | Sucker MCs | Run DMC | male | 1984 | Dart Adams | Critic | US |
| 3 | Lyrics Of Fury | Eric B & Rakim | male | 1988 | Dart Adams | Critic | US |
| 4 | Rebel Without A Pause | Public Enemy | male | 1988 | Dart Adams | Critic | US |
| 5 | The Message | Grandmaster Flash & The Furious Five | male | 1982 | Dart Adams | Critic | US |
| 1 | Juicy | The Notorious B.I.G. | male | 1994 | Insanul Ahmed | Genius | US |
| 2 | Nuthin’ But A ‘G’ Thang | Dr Dre ft. Snoop Doggy Dogg | male | 1992 | Insanul Ahmed | Genius | US |
| 3 | The Message | Grandmaster Flash & The Furious Five | male | 1982 | Insanul Ahmed | Genius | US |
| 4 | In Da Club | 50 Cent | male | 2003 | Insanul Ahmed | Genius | US |
| 5 | m.A.A.d. city | Kendrick Lamar | male | 2012 | Insanul Ahmed | Genius | US |
Rankings
The rankings data set reports some metadata, including the name of the artists, the song release year, and how many voters picked each song among their favorites. The points variable gives each song a total score that takes into account where the song is positioned in the voters’ rankings. Each track is awarded ten points if it ranks first, eight points if it ranks second, and so on down to two points for fifth place.
rankings %>%
show_table(head = 20, align = c(rep("l", 3), rep("r", 5)), scroll = TRUE,
caption = "Rankings data set (first 20 rows)")| ID | title | artist | year | gender | points | n | n1 | n2 | n3 | n4 | n5 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Juicy | The Notorious B.I.G. | 1994 | male | 140 | 18 | 9 | 3 | 3 | 1 | 2 |
| 2 | Fight The Power | Public Enemy | 1989 | male | 100 | 11 | 7 | 3 | 1 | 0 | 0 |
| 3 | Shook Ones (Part II) | Mobb Deep | 1995 | male | 94 | 13 | 4 | 5 | 1 | 1 | 2 |
| 4 | The Message | Grandmaster Flash & The Furious Five | 1982 | male | 90 | 14 | 5 | 3 | 1 | 0 | 5 |
| 5 | Nuthin’ But A ‘G’ Thang | Dr Dre ft. Snoop Doggy Dogg | 1992 | male | 84 | 14 | 2 | 4 | 2 | 4 | 2 |
| 6 | C.R.E.A.M. | Wu-Tang Clan | 1993 | male | 62 | 10 | 3 | 1 | 1 | 4 | 1 |
| 7 | 93 ’Til Infinity | Souls of Mischief | 1993 | male | 50 | 7 | 2 | 2 | 2 | 0 | 1 |
| 8 | Passin’ Me By | The Pharcyde | 1992 | male | 48 | 6 | 3 | 2 | 0 | 0 | 1 |
| 9 | N.Y. State Of Mind | Nas | 1994 | male | 46 | 7 | 1 | 3 | 1 | 1 | 1 |
| 10 | Dear Mama | 2Pac | 1995 | male | 42 | 6 | 2 | 1 | 1 | 2 | 0 |
| 11 | Runaway | Kanye West ft. Pusha T | 2010 | male | 38 | 5 | 2 | 0 | 3 | 0 | 0 |
| 12 | Paid In Full | Eric B & Rakim | 1987 | male | 36 | 5 | 1 | 1 | 3 | 0 | 0 |
| 13 | Rapper’s Delight | Sugarhill Gang | 1979 | male | 36 | 4 | 2 | 2 | 0 | 0 | 0 |
| 14 | They Reminisce Over You (T.R.O.Y.) | Pete Rock & C.L. Smooth | 1992 | male | 34 | 6 | 1 | 0 | 2 | 3 | 0 |
| 15 | Fuck Tha Police | NWA | 1988 | male | 32 | 5 | 1 | 1 | 2 | 0 | 1 |
| 16 | Electric Relaxation | A Tribe Called Quest | 1993 | male | 32 | 5 | 0 | 3 | 1 | 0 | 1 |
| 17 | B.O.B. | OutKast | 2000 | male | 32 | 4 | 2 | 1 | 0 | 1 | 0 |
| 18 | It Was A Good Day | Ice Cube | 1992 | male | 30 | 5 | 2 | 0 | 1 | 0 | 2 |
| 19 | U.N.I.T.Y. | Queen Latifah | 1993 | female | 30 | 5 | 1 | 1 | 1 | 1 | 1 |
| 20 | Doo Wop (That Thing) | Lauryn Hill | 1998 | female | 28 | 5 | 1 | 1 | 1 | 0 | 2 |
Exploratory Analysis
Before delving into the Hip Hop songs, let’s have a look at the voters first. The voters come from 13 countries across the six continents: North America, South America, Europe, Africa, Asia, and Australia. The majority of the voters contacted by BBC Music are from the United States, as that is the region where hip-hop culturally originated.
df_countries <- polls %>%
distinct(critic_name, .keep_all = TRUE) %>%
count(critic_country, sort = TRUE) %>%
rename(region = critic_country) %>%
mutate(region = ifelse(region == "Russian Federation", "Russia", region),
region = ifelse(region == "US", "USA", region))
df_countries %>%
show_table(col.names = c("Country", "Frequency"), align = c("l", "r"),
caption = "Distribution of voters across countries")| Country | Frequency |
|---|---|
| USA | 73 |
| Germany | 9 |
| South Africa | 5 |
| UK | 5 |
| Canada | 2 |
| China | 2 |
| Colombia | 2 |
| Japan | 2 |
| Kenya | 2 |
| New Zealand | 2 |
| India | 1 |
| Nigeria | 1 |
| Russia | 1 |
map.world_joined <- map_data("world") %>%
left_join(df_countries, by = "region")
region.lab.data <- map.world_joined %>%
filter(!is.na(n)) %>%
group_by(region) %>%
summarise(long = mean(long), lat = mean(lat))
plot.map <- map.world_joined %>%
mutate(n = ifelse(is.na(n), FALSE, TRUE)) %>%
ggplot(aes(long, lat)) +
geom_polygon(aes(group = group, fill = n), show.legend = FALSE) +
geom_text(aes(label = region), data = region.lab.data, size = 6.5, family = "Montaga") +
scale_fill_manual(name = "Frequency", values = c("gray70", "#F26989")) +
labs(title = "Voters' countries of origin", x = NULL, y = NULL,
subtitle = "Almost 70% of voters come from the United States.",
caption = "source: BBC Music, TidyTuesday 2020|week 16") +
theme(axis.text = element_blank())
ggsave(here("2020", "week16", "Plots", "Critics_map.pdf"), plot = plot.map,
width = 11, height = 7, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Critics_map.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Critics_map.png"),
verbose = FALSE)Fifty out of 107 voters are music critics. The occupations of the remaining voters are shown in the treemap below.
plot.roles <- polls %>%
distinct(critic_name, .keep_all = TRUE) %>%
count(critic_rols, sort = TRUE) %>%
slice(-1) %>%
ggplot(aes(area = n, label = critic_rols, fill = n)) +
geom_treemap(alpha = 0.6) +
geom_treemap_text(place = "centre", size = 20, min.size = 13,
grow = FALSE, reflow = TRUE, family = "Montaga") +
scale_fill_gradientn(colors = rev(c("#404664", "#726CC6", "#AAA7DD", "#D3D3EE")),
name = "Frequency\n",
guide = guide_colorbar(label = TRUE, frame.colour = "black",
label.position = "bottom", barwidth = 8,
barheight = 1, direction = 'horizontal')) +
labs(title = "Voters' Occupations",
subtitle = "Excluding the 'Critic' role, which makes up for half of the whole jobs (50 out of 107).",
caption = "source: BBC Music, TidyTuesday 2020|week 16") +
theme(plot.background = element_rect(fill = NA))
ggsave(here("2020", "week16", "Plots", "Critics_roles.pdf"), plot = plot.roles,
width = 12, height = 10, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Critics_roles.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Critics_roles.png"),
verbose = FALSE)Now that we have inspected the critics, let’s have a look at the songs and singers. Most Hip Hop artists are male. Female artists and featured collaborations come almost in a tie and jointly account for about 20% of the greatest songs.
polls %>%
distinct(artist, .keep_all = TRUE) %>%
frequency_table(gender, align = c("l", "r", "r"),
caption = "The gender of the best Hip Hop artists according to the BBC Music poll")| Gender | Frequency | Proportion |
|---|---|---|
| male | 170 | 82.1% |
| mixed | 19 | 9.2% |
| female | 18 | 8.7% |
The best Hip Hop songs were released in the last 30 years. The oldest song is “Rapper’s Delight” by the Sugarhill Gang, and it is dated back to 1979. While it was not the first single to include rapping, “Rapper’s Delight” is credited for introducing hip hop music to a wide audience. By 1979 hip hop music had become a mainstream genre. The most rated songs were released from the early to the late Nineties. This period is considered the Golden Age of Hip Hop.
text.color <- "#46494c"
df_shades <- data.frame(xmin = c(-Inf, 1983, 1986, 1997, 2006, 2014),
xmax = c(1983, 1986, 1997, 2006, 2014, Inf),
ymin = rep(0, 6),
ymax = rep(Inf, 6),
fill = rep(c("#ABA7DD", "#F9DEC9"), times = 3))
df_text <- data.frame(x = c(1980.5, 1984.5, 1991.5, 2001.5, 2010, 2017),
y = c(20, 20, 20.7, 20.7, 20, 20),
label = c("Old\nSchool", "New\nSchool", "Golden Age", "Bling Era",
"Alternative\nand Electronic", "Trap and\nMumble Rap"))
plot.hiphop.periods <- polls %>%
distinct(title, .keep_all = TRUE) %>%
count(year) %>%
ggplot(aes(year, n)) +
# Shaded boxes for hip hop periods
annotate("rect", xmin = df_shades$xmin, xmax = df_shades$xmax,
ymin = df_shades$ymin, ymax = df_shades$ymax, fill = df_shades$fill, alpha = 0.6) +
# Text annotations for periods
annotate("text", x = df_text$x, y = df_text$y, label = df_text$label, size = 7, colour = text.color, family = "Montaga") +
# Annotation for Rapper's Delight
annotate("segment", x = 1979, xend = 1979, y = 3.4, yend = 1.5, colour = text.color,
arrow = arrow(length=unit(0.1, "cm"))) +
annotate("text", x = 1979 + 2.2, y = 4, label = "Rapper's Delight", size = 6, colour = text.color,
family = "Montaga") +
# Bar plot
geom_col(fill = "#889690", width = 0.8, color = "black") +
labs(title = "Release year of the best Hip Hop songs",
x = "Release Year", y = "Number of released songs",
subtitle = "The Nineties are considered the Golden Age of Hip Hop.",
caption = "source: BBC Music, TidyTuesday 2020|week 16") +
scale_x_continuous(breaks = c(1979, 1983, 1986, 1997, 2006, 2014, 2019), expand = c(0.008, 0.008)) +
scale_y_continuous(expand = c(0, 0, 0.02, 0))
ggsave(here("2020", "week16", "Plots", "HipHop_periods.pdf"), plot = plot.hiphop.periods,
width = 13, height = 7, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "HipHop_periods.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "HipHop_periods.png"),
verbose = FALSE)Let’s have a look at the songs with the highest points. The points were awarded in the following way: 10 points for the first ranked track, eight points for the second-ranked track, and so on down to two points for the fifth place.
At the top, we find “Juicy” by The Notorious B.I.G. with 140 points. The song traces the story of Notorious B.I.G., from his childhood years living in poverty, his dreams of becoming a rapper, the early musical influences, his time dealing drugs, criminal involvement, and his eventual success in the music industry and current lavish lifestyle.
In the second position, we find “Fight the Power” by Public Enemy. The song, which also appeared as a soundtrack in the film “Do the Right Thing”, alludes to African-American culture, civil rights exhortations, black church services, and the music of James Brown.
The most rated song by a female artist is “U.N.I.T.Y.” by “Queen Latifah”, whereas the most rated song by a band is “Ready Or Not” by The Fugees.
cols.gender <- c("#ff006e", "#3a86ff", "#ffbe0b")
plot.rank.gender <- rankings %>%
head(30) %>%
mutate(title = fct_reorder(title, points),
gender = as.factor(gender)) %>%
ggplot(aes(title, points, fill = gender)) +
geom_segment(aes(x = title, xend = title, y = 0, yend = points, color = gender), size = 1.4, show.legend = FALSE) +
geom_point(aes(color = gender), size = 4) +
scale_y_continuous(breaks = c(seq(0, 150, by = 25)), expand = c(0.008, 0.008)) +
scale_fill_manual(name = "Artist(s) gender", values = cols.gender) +
scale_color_manual(name = "Artist(s) gender", values = cols.gender) +
labs(title = "The greatest Hip Hop songs of all time", y = "Number of points awarded", x = "",
subtitle = "Among the songs with the 30 highest scores, there are two by female artists, and one by a group.",
caption = "source: BBC Music, TidyTuesday 2020|week 16") +
coord_flip() +
theme(panel.border = element_rect(fill = NA, colour = "grey70"),
panel.grid = element_line(colour = "grey87"))
ggsave(here("2020", "week16", "Plots", "Ranking_gender.pdf"), plot = plot.rank.gender,
width = 15, height = 12, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Ranking_gender.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Ranking_gender.png"),
verbose = FALSE)The following bar chart breaks down the rankings associated with the songs that received at least five votes.
plot.stacked.rank <- rankings %>%
inner_join(rankings_df, by = c("title", "artist", "year", "gender", "points")) %>%
select(ID:n5, url) %>%
pivot_longer(cols = n1 : n5,
names_to = "rank",
values_to = "count") %>%
mutate(rank = str_sub(rank, start = 2),
rank = factor(rank, levels = sort(unique(rank), decreasing = TRUE)),
title = paste(title, "\n", artist),
title = fct_reorder(title, n)) %>%
filter(n >= 5) %>%
ggplot(aes(x = count, y = title, fill = rank)) +
geom_col(width=0.6, color = "black") +
geom_image(aes(x = n + 0.5, y = title, image = url), size = 0.029, asp = 1.375) +
scale_fill_manual(name = "Ranking", values = cols, breaks = c("1", "2", "3", "4", "5"),
labels = c("First", "Second", "Third", "Fourth", "Fifth"),
guide = guide_legend(direction = "horizontal", title.position = "top",
label.position = "bottom")) +
labs(title = "The greatest Hip Hop songs of all time",
subtitle = "Songs with at least 5 votes",
caption = "source: BBC Music, TidyTuesday 2020|week 16",
y = NULL, x = "Number of votes received by a pool of 107 critics") +
scale_x_continuous(limits = c(0,19), expand = c(0, 0)) +
theme(legend.position = c(0.75, 0.5),
axis.text.y = element_text(face="bold", color ="black"),
axis.text.x = element_text(size = 20, color = "black"),
axis.title.x = element_text(size = 22, color = "black"),
plot.caption = element_text(size = 18))
ggsave(here("2020", "week16", "Plots", "Ranking_stacked.pdf"), plot = plot.stacked.rank,
width = 17.5, height = 16, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Ranking_stacked.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Ranking_stacked.png"),
verbose = FALSE)Audio features from Spotify
We can get the audio feature of the Hip Hop songs from the Spotify API. The first step is associating the Spotify IDs to the songs in ratings.
pull_id <- function(query) {
# retrieve the id from a query (song title and artist name)
search_spotify(query, "track") %>%
arrange(-popularity) %>%
filter(row_number() == 1) %>%
pull(id)
}
# strip out everything after song name and artist due to featuring artists
ranking_ids <- rankings %>%
mutate(search_query = paste(title, artist),
search_query = str_to_lower(search_query),
search_query = str_remove(search_query, "ft.*$")) %>%
mutate(id = map_chr(search_query, possibly(pull_id, NA_character_)))Following is the dataframe of the song rankings with the associated Spotify IDs.
ranking_ids %>%
show_table(head = 20, scroll = TRUE, caption = "Dataframe of song rankings with Spotify IDs (first 20 rows)")| ID | title | artist | year | gender | points | n | n1 | n2 | n3 | n4 | n5 | search_query | id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Juicy | The Notorious B.I.G. | 1994 | male | 140 | 18 | 9 | 3 | 3 | 1 | 2 | juicy the notorious b.i.g. | 5ByAIlEEnxYdvpnezg7HTX |
| 2 | Fight The Power | Public Enemy | 1989 | male | 100 | 11 | 7 | 3 | 1 | 0 | 0 | fight the power public enemy | 1yo16b3u0lptm6Cs7lx4AD |
| 3 | Shook Ones (Part II) | Mobb Deep | 1995 | male | 94 | 13 | 4 | 5 | 1 | 1 | 2 | shook ones (part ii) mobb deep | 4nASzyRbzL5qZQuOPjQfsj |
| 4 | The Message | Grandmaster Flash & The Furious Five | 1982 | male | 90 | 14 | 5 | 3 | 1 | 0 | 5 | the message grandmaster flash & the furious five | 5DuTNKFEjJIySAyJH1yNDU |
| 5 | Nuthin’ But A ‘G’ Thang | Dr Dre ft. Snoop Doggy Dogg | 1992 | male | 84 | 14 | 2 | 4 | 2 | 4 | 2 | nuthin’ but a ‘g’ thang dr dre | 4YtoipFgf4k0AfD17ZfD5X |
| 6 | C.R.E.A.M. | Wu-Tang Clan | 1993 | male | 62 | 10 | 3 | 1 | 1 | 4 | 1 | c.r.e.a.m. wu-tang clan | 119c93MHjrDLJTApCVGpvx |
| 7 | 93 ’Til Infinity | Souls of Mischief | 1993 | male | 50 | 7 | 2 | 2 | 2 | 0 | 1 | 93 ’til infinity souls of mischief | 0PV1TFUMTBrDETzW6KQulB |
| 8 | Passin’ Me By | The Pharcyde | 1992 | male | 48 | 6 | 3 | 2 | 0 | 0 | 1 | passin’ me by the pharcyde | 4G3dZN9o3o2X4VKwt4CLts |
| 9 | N.Y. State Of Mind | Nas | 1994 | male | 46 | 7 | 1 | 3 | 1 | 1 | 1 | n.y. state of mind nas | 5zwz05jkQVT68CjUpPwFZe |
| 10 | Dear Mama | 2Pac | 1995 | male | 42 | 6 | 2 | 1 | 1 | 2 | 0 | dear mama 2pac | 6tDxrq4FxEL2q15y37tXT9 |
| 11 | Runaway | Kanye West ft. Pusha T | 2010 | male | 38 | 5 | 2 | 0 | 3 | 0 | 0 | runaway kanye west | 3DK6m7It6Pw857FcQftMds |
| 12 | Paid In Full | Eric B & Rakim | 1987 | male | 36 | 5 | 1 | 1 | 3 | 0 | 0 | paid in full eric b & rakim | 0SwuCcwpFM6x4cu5zOvmi0 |
| 13 | Rapper’s Delight | Sugarhill Gang | 1979 | male | 36 | 4 | 2 | 2 | 0 | 0 | 0 | rapper’s delight sugarhill gang | 0FWhGmPVxLI6jOVF0wjALa |
| 14 | They Reminisce Over You (T.R.O.Y.) | Pete Rock & C.L. Smooth | 1992 | male | 34 | 6 | 1 | 0 | 2 | 3 | 0 | they reminisce over you (t.r.o.y.) pete rock & c.l. smooth | 2Mb3zpobD0CvJGWv6NpsPy |
| 15 | Fuck Tha Police | NWA | 1988 | male | 32 | 5 | 1 | 1 | 2 | 0 | 1 | fuck tha police nwa | 5n8Aro6j1bEGIy7Tpo7FV7 |
| 16 | Electric Relaxation | A Tribe Called Quest | 1993 | male | 32 | 5 | 0 | 3 | 1 | 0 | 1 | electric relaxation a tribe called quest | 0eEXcw3JLVXcRxYrVYMy68 |
| 17 | B.O.B. | OutKast | 2000 | male | 32 | 4 | 2 | 1 | 0 | 1 | 0 | b.o.b. outkast | 3WibbMr6canxRJXhNtAvLU |
| 18 | It Was A Good Day | Ice Cube | 1992 | male | 30 | 5 | 2 | 0 | 1 | 0 | 2 | it was a good day ice cube | 2qOm7ukLyHUXWyR4ZWLwxA |
| 19 | U.N.I.T.Y. | Queen Latifah | 1993 | female | 30 | 5 | 1 | 1 | 1 | 1 | 1 | u.n.i.t.y. queen latifah | 3mmbJnh1L94Zl8QZcUTq39 |
| 20 | Doo Wop (That Thing) | Lauryn Hill | 1998 | female | 28 | 5 | 1 | 1 | 1 | 0 | 2 | doo wop (that thing) lauryn hill | 0uEp9E98JB5awlA084uaIg |
We managed to associate with a Spotify ID more than 94% of the songs. For eighteen of them, no correspondence was found, probably either due to the absence of the song on the Spotify catalog or because of some slight differences in the song titles.
ranking_ids %>%
filter(is.na(id)) %>%
select(title, artist, year) %>%
show_table(caption = "Songs without a Spotify ID and discarded from the analysis", align = c("l", "l", "r"))| title | artist | year |
|---|---|---|
| Wu-Tang Clan Ain’t Nuthing Ta Fuck Wit | Wu-Tang Clan | 1993 |
| Double Trouble At The Amphitheatre | Double Trouble | 1983 |
| Ain’t No N*gga | JAY-Z ft. Foxy Brown | 1996 |
| Self Destruction | Stop The Violence Movement | 1989 |
| Soweto | ProKid | 2005 |
| The Bridge Is Over | BDP | 1987 |
| Beat Bop | Rammellzee & K Rob | 1983 |
| Atrevido | Orishas | 2000 |
| Cartoons & Cereal | Kendrick Lamar ft. Gunplay | 2012 |
| Mo(u)rning | Akua Naru | 2012 |
| Vicious Rap | Tanya ‘Sweet Tee’ Winley | 1980 |
| La Di Da Di | Doug E Fresh & The Get Fresh Crew | 1985 |
| Ojuelegba (Remix) | Wizkid ft. Drake & Skepta | 2015 |
| Play That Beat Mr DJ | GLOBE & Whiz Kid | 2001 |
| It Was A Good Day (B-Side Remix version) | Ice Cube | 1994 |
| Mtaktak | Shabjdeed & Al Nather | 2019 |
| Ngqangqa | Kanyi | 2017 |
| Shinjitsu No Dangan | King Giddra | 1995 |
Now that we have associated the greatest Hip Hop songs with their Spotify IDs, we can get the audio features for the individual tracks. Because the functions can handle a limited set of IDs at a time, we divide the data frame into folds and perform the operation on each subset.
# https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/
ranking_features <- ranking_ids %>%
mutate(id_group = row_number() %/% 80) %>%
select(id_group, id) %>%
nest(data = c(id)) %>%
mutate(audio_features = purrr::map(data, ~get_track_audio_features(.$id)))
# https://developer.spotify.com/documentation/web-api/reference/tracks/get-track/
ranking_tracks <- ranking_ids %>%
mutate(id_group = row_number() %/% 50) %>%
select(id_group, id) %>%
nest(data = c(id)) %>%
mutate(track_features = purrr::map(data, ~get_tracks(.$id)))We create the rankings_df data frame by joining the song IDs with their audio and track features, and the URLs of their album covers.
audio_features <- ranking_features %>%
select(audio_features) %>%
unnest(audio_features) %>%
select(danceability:tempo, duration_ms, time_signature)
track_features <- ranking_tracks %>%
select(track_features) %>%
unnest(track_features) %>%
select(explicit, popularity)
urls <- ranking_tracks %>%
unnest(track_features) %>%
pull(album.images) %>%
purrr::map(function(x){ ifelse(is.null(x), NA, x[1,2])}) %>%
unlist() %>%
tibble(url = .)
rankings_df <- ranking_ids %>%
bind_cols(audio_features, track_features, urls) %>%
select(title, artist, points, year, gender, danceability : popularity, url) %>%
na.omit()
rankings_df %>%
show_table(caption = "Dataframe of song rankings with audio and track features (first 20 rows)",
scroll = TRUE, head = 20)| title | artist | points | year | gender | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | explicit | popularity | url |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Juicy | The Notorious B.I.G. | 140 | 1994 | male | 0.89 | 0.82 | 9 | -4.67 | 1 | 0.25 | 0.47 | 0.00 | 0.20 | 0.78 | 96.06 | 302693 | 4 | TRUE | 76 | https://i.scdn.co/image/ab67616d0000b273a4950162a626593b7340f6c7 |
| Fight The Power | Public Enemy | 100 | 1989 | male | 0.80 | 0.58 | 2 | -12.97 | 1 | 0.26 | 0.00 | 0.00 | 0.52 | 0.42 | 105.97 | 282640 | 4 | FALSE | 66 | https://i.scdn.co/image/ab67616d0000b2732e3d1de8b2f61a477ae1ed6c |
| Shook Ones (Part II) | Mobb Deep | 94 | 1995 | male | 0.64 | 0.88 | 6 | -5.51 | 1 | 0.37 | 0.08 | 0.00 | 0.12 | 0.65 | 94.92 | 256333 | 4 | TRUE | 42 | https://i.scdn.co/image/ab67616d0000b273086d14bc1c05200680d290c9 |
| The Message | Grandmaster Flash & The Furious Five | 90 | 1982 | male | 0.95 | 0.61 | 10 | -10.58 | 0 | 0.20 | 0.02 | 0.00 | 0.09 | 0.73 | 100.62 | 431800 | 4 | FALSE | 50 | https://i.scdn.co/image/ab67616d0000b273798575ed938d0968a00ce887 |
| Nuthin’ But A ‘G’ Thang | Dr Dre ft. Snoop Doggy Dogg | 84 | 1992 | male | 0.80 | 0.70 | 11 | -8.18 | 0 | 0.28 | 0.01 | 0.00 | 0.15 | 0.66 | 94.61 | 238467 | 4 | TRUE | 71 | https://i.scdn.co/image/ab67616d0000b273dc20ba099bc933674f58ebae |
| C.R.E.A.M. | Wu-Tang Clan | 62 | 1993 | male | 0.48 | 0.55 | 11 | -10.55 | 0 | 0.37 | 0.57 | 0.02 | 0.13 | 0.58 | 180.99 | 252187 | 4 | TRUE | 73 | https://i.scdn.co/image/ab67616d0000b2735901aaa980d3e714bf01171c |
| 93 ’Til Infinity | Souls of Mischief | 50 | 1993 | male | 0.59 | 0.67 | 1 | -11.79 | 1 | 0.41 | 0.12 | 0.00 | 0.15 | 0.69 | 206.25 | 286440 | 4 | FALSE | 68 | https://i.scdn.co/image/ab67616d0000b27343969ecfe687484121805478 |
| Passin’ Me By | The Pharcyde | 48 | 1992 | male | 0.76 | 0.76 | 4 | -8.14 | 0 | 0.27 | 0.09 | 0.00 | 0.26 | 0.61 | 87.06 | 303493 | 4 | FALSE | 67 | https://i.scdn.co/image/ab67616d0000b2739ec4abd35652fafe34ee7dfb |
| N.Y. State Of Mind | Nas | 46 | 1994 | male | 0.66 | 0.91 | 6 | -4.68 | 0 | 0.22 | 0.04 | 0.00 | 0.23 | 0.89 | 84.10 | 293973 | 4 | TRUE | 66 | https://i.scdn.co/image/ab67616d0000b27375d9ecf8d29744037d2d6064 |
| Dear Mama | 2Pac | 42 | 1995 | male | 0.77 | 0.54 | 6 | -7.12 | 1 | 0.10 | 0.37 | 0.00 | 0.13 | 0.32 | 84.11 | 280000 | 4 | TRUE | 73 | https://i.scdn.co/image/ab67616d0000b27304b9ab6bd4bf6a350ba902ea |
| Runaway | Kanye West ft. Pusha T | 38 | 2010 | male | 0.57 | 0.57 | 1 | -3.83 | 0 | 0.07 | 0.22 | 0.00 | 0.51 | 0.11 | 86.99 | 547733 | 4 | TRUE | 72 | https://i.scdn.co/image/ab67616d0000b2735bd363295a677dacd0b4187b |
| Paid In Full | Eric B & Rakim | 36 | 1987 | male | 0.85 | 0.71 | 11 | -9.50 | 0 | 0.34 | 0.16 | 0.00 | 0.10 | 0.75 | 104.21 | 232760 | 4 | FALSE | 54 | https://i.scdn.co/image/ab67616d0000b273d9a71c225d94b8cb6638bb91 |
| Rapper’s Delight | Sugarhill Gang | 36 | 1979 | male | 0.90 | 0.84 | 11 | -7.31 | 0 | 0.26 | 0.46 | 0.00 | 0.07 | 0.91 | 111.19 | 427360 | 4 | FALSE | 55 | https://i.scdn.co/image/ab67616d0000b2731e11df565134deafff674368 |
| They Reminisce Over You (T.R.O.Y.) | Pete Rock & C.L. Smooth | 34 | 1992 | male | 0.76 | 0.68 | 2 | -10.89 | 1 | 0.16 | 0.14 | 0.00 | 0.34 | 0.67 | 101.79 | 286693 | 4 | FALSE | 60 | https://i.scdn.co/image/ab67616d0000b273d535e6a05ec517c25a232f43 |
| Fuck Tha Police | NWA | 32 | 1988 | male | 0.86 | 0.75 | 8 | -8.32 | 0 | 0.30 | 0.02 | 0.00 | 0.05 | 0.86 | 98.69 | 345717 | 4 | TRUE | 72 | https://i.scdn.co/image/ab67616d0000b273c79a70e8167cc1a4fab83781 |
| Electric Relaxation | A Tribe Called Quest | 32 | 1993 | male | 0.87 | 0.53 | 11 | -9.20 | 1 | 0.23 | 0.18 | 0.27 | 0.09 | 0.84 | 98.24 | 226133 | 4 | FALSE | 66 | https://i.scdn.co/image/ab67616d0000b273b1ddb9f5f519e0d37bc94a53 |
| B.O.B. | OutKast | 32 | 2000 | male | 0.75 | 0.98 | 9 | -5.32 | 1 | 0.10 | 0.06 | 0.00 | 0.03 | 0.65 | 153.90 | 304227 | 4 | TRUE | 62 | https://i.scdn.co/image/ab67616d0000b273c2c10aadaee962bb683354fc |
| It Was A Good Day | Ice Cube | 30 | 1992 | male | 0.80 | 0.74 | 7 | -5.33 | 0 | 0.14 | 0.33 | 0.00 | 0.29 | 0.79 | 82.36 | 260000 | 4 | TRUE | 74 | https://i.scdn.co/image/ab67616d0000b273994c319841a923465d62e271 |
| U.N.I.T.Y. | Queen Latifah | 30 | 1993 | female | 0.66 | 0.74 | 4 | -4.08 | 0 | 0.36 | 0.01 | 0.00 | 0.32 | 0.52 | 93.80 | 251907 | 4 | FALSE | 52 | https://i.scdn.co/image/ab67616d0000b2739c829d12d7d599735fb1f437 |
| Doo Wop (That Thing) | Lauryn Hill | 28 | 1998 | female | 0.54 | 0.50 | 2 | -8.93 | 0 | 0.24 | 0.04 | 0.00 | 0.09 | 0.50 | 99.94 | 320267 | 4 | FALSE | 73 | https://i.scdn.co/image/ab67616d0000b273e08b1250db5f75643f1508c9 |
Let’s have a look at the audio features of the tracks.
Time signature
The most common time signature is 4/4, that is, when the song has four quarter note beats. “Monster” by Kanye West is the only song in 5/4, whereas “Love Yourz” by J Cole is the only one in 1/4.
# skim(rankings_df)
rankings_df %>%
frequency_table(time_signature, caption = "Time signature of the greatest Hip Hop songs")| Time_signature | Frequency | Proportion |
|---|---|---|
| 4 | 287 | 98.0% |
| 3 | 4 | 1.4% |
| 1 | 1 | 0.3% |
| 5 | 1 | 0.3% |
Mode
Most of the songs are in major mode, which is often associated with feelings of positivity and happiness.
rankings_df %>%
mutate(mode = fct_recode(as.factor(mode), Major = "1", Minor = "0")) %>%
frequency_table(mode,
caption = "Mode distribution of the best Hip Hop songs",
align = c("l", "r", "r"))| Mode | Frequency | Proportion |
|---|---|---|
| Major | 176 | 60.1% |
| Minor | 117 | 39.9% |
Explicit content
Most of the songs have explicit lyrics. Greg Beato of the magazine “Reason” observed that by the 1990s, “A hip-hop album that didn’t warrant a Tipper (the co-founder of the center commonly credited with beginning movements for the Parental Advisory label, ed.) sticker was artistically suspect”.
rankings_df %>%
frequency_table(explicit, caption = "Distribution of explicit content in songs lyrics",
align = c("l", "r", "r"))| Explicit | Frequency | Proportion |
|---|---|---|
| TRUE | 213 | 72.7% |
| FALSE | 80 | 27.3% |
rankings_df_tall <- rankings_df %>%
select(- c(title:gender, mode, time_signature, explicit, url)) %>%
mutate(duration_ms = as.double(duration_ms),
popularity = as.double(popularity)) %>%
pivot_longer(
cols = c(danceability:duration_ms, popularity),
names_to = "feature",
values_to = "value"
)
histogram_features <- rankings_df_tall %>%
mutate(feature = factor(feature, levels = unique(rankings_df_tall$feature))) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30, fill = "#404664", alpha = 0.7) +
facet_wrap(~feature, scales = "free") +
labs(x = "", y = "Density",
title = "Audio features of the greatest Hip Hop songs",
subtitle = "Hip hop songs tend to be danceable, energic, loud, speechy, acoustic, with a low key and a positive meaning.",
caption = "source: BBC Music, TidyTuesday 2020|week 16, Spotify API") +
theme_light() +
theme(text = element_text(family = "Montaga"),
plot.background = element_rect(fill = mygray),
strip.text = element_text(size = 17),
strip.background = element_rect(fill = mygray),
strip.text.x = element_text(colour = "black"),
plot.title = element_text(size = 25),
plot.subtitle = element_text(size = 15),
plot.caption = element_text(size = 14),
axis.text = element_text(size = 10),
axis.title.y = element_text(size = 20))
ggsave(here("2020", "week16", "Plots", "Histogram_features.pdf"), plot = histogram_features,
width = 11, height = 9, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Histogram_features.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Histogram_features.png"),
verbose = FALSE)Let us quantify the relationships existing among these features through a correlation plot. Most of the correlations are close to zero, but some pairs of variables are fairly correlated. It is the case of energy and loudness, that is, energetic songs tend to be loud and the other way around. The variable year is positively correlated with loudness, and negatively correlated with danceability and valence. This means that more recent songs tend to be louder, sadder, and less danceable.
corrmat <- rankings_df %>%
select(year, danceability:tempo, popularity) %>%
cor() %>%
ggcorrplot(type = "lower", colors = c("#6D9EC1", "white", "#E46726"),
outline.col = "black", legend.title = "Correlation",
lab = TRUE, ggtheme = ggplot2::theme_light(), lab_size = 3.8) +
guides(fill = guide_colorbar(label = TRUE, frame.colour = "black", ticks = TRUE)) +
labs(title = "Correlation matrix of the songs audio features",
caption = "source: BBC Music, TidyTuesday 2020|week 16, Spotify API") +
theme(text = element_text(family = "Montaga"),
plot.background = element_rect(fill = mygray),
plot.title = element_text(size = 19),
plot.caption = element_text(size = 10),
axis.text = element_text(size = 14, colour = "black"),
legend.background = element_blank(),
legend.text = element_text(size = 9),
legend.title = element_text(size = 14))
ggsave(here("2020", "week16", "Plots", "Correlation_matrix.pdf"), plot = corrmat,
width = 7.8, height = 6.77, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Correlation_matrix.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Correlation_matrix.png"),
verbose = FALSE)Exploratory Factor Analysis
We can perform an exploratory factor analysis to find a small number of factors that can explain the relationships among the song features. Since factor analysis requires continuous variables, we remove the binary variables gender, explicit, and time_signature. We also drop tempo given that it is already included in danceability, and duration_ms.
rankings_fa <- rankings_df %>%
select(-c(title, artist, url, popularity, gender, explicit, mode,
time_signature, tempo, duration_ms, points))The first step is determining the number of common factors to extract. Based on the heuristics summarized in the plot below, five factors seem to be sufficient.
ap <- parallel(subject = nrow(rankings_fa),var = ncol(rankings_fa), rep=100, cent = .05)
nS <- nScree(x = eigen(cor(rankings_fa))$values, aparallel = ap$eigen$qevpea)
# adapted from plotnScree(nS) function ---------------------------------------------
eig <- nS$Analysis$Eigenvalues
nk <- length(eig); k <- 1:nk; noc <- nS$Components$noc
vp.p <- lm(eig[c(noc + 1, nk)] ~ k[c(noc + 1, nk)])
leg.txt <- c(paste0("Eigenvalues (> mean = ", nS$Components$nkaiser, ")"),
paste0("Parallel Analysis (n = ", nS$Components$nparallel, ")"))
screeplot <- data.frame(component = rep(1:length(eig), 2),
group = c(rep("eigen", 10), rep("parallel", 10)),
value = c(eig, nS$Analysis$Par.Analysis)) %>%
ggplot(aes(x = component, y = value, group = group)) +
scale_x_continuous(breaks = seq(1, nk, by = 1)) +
geom_point(aes(shape = group, colour = group), size = 3) +
geom_line(aes(linetype=group, color = group), size = 1.4) +
scale_linetype_manual(name = "Method", values = c("solid", "dotted"), label = leg.txt) +
scale_color_manual(name = "Method", values = c("#3d405b", "#43aa8b"), label = leg.txt) +
scale_shape_manual(name = "Method", values = c(19,17), label = leg.txt) +
annotate("segment", x = k[c(1, nk)][1], xend = k[c(1, nk)][2],
y = sum(c(1, 1) * coef(vp.p)), yend = sum(c(1, nk) * coef(vp.p)),
color = "#e07a5f", size = 1.4) +
labs(y = "Eigenvalues", x = "Components",
title = "Empirical methods for determining the number of factors",
subtitle = "The red line determines the optimal coordinates.",
caption = "source: BBC Music, TidyTuesday 2020|week 16, Spotify API") +
theme(legend.position = c(0.8,0.93),
panel.background = element_rect(fill = "white"),
panel.grid = element_line(colour = "grey87"),
panel.border = element_rect(colour = "grey70", fill = NA),
plot.title = element_text(size = 22))
ggsave(here("2020", "week16", "Plots", "Screeplot.pdf"), plot = screeplot,
width = 10, height = 10, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Screeplot.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Screeplot.png"),
verbose = FALSE)Let’s fit a factor analysis model with 4 factors.
nf <- nS$Components$nkaiser
fa <- fa(rankings_fa, nfactors = nf, rotate = "varimax")
colnames(fa$loadings) <- paste0("Factor", 1:nf)
fa## Factor Analysis using method = minres
## Call: fa(r = rankings_fa, nfactors = nf, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## Factor1 Factor2 Factor3 Factor4 h2 u2 com
## year 0.17 -0.64 0.17 -0.02 0.47 0.53 1.3
## danceability -0.10 0.37 -0.68 0.07 0.61 0.39 1.6
## energy 0.81 0.22 0.08 -0.05 0.71 0.29 1.2
## key -0.02 0.26 0.02 0.19 0.10 0.90 1.9
## loudness 0.82 -0.29 -0.10 -0.13 0.78 0.22 1.3
## speechiness -0.10 0.18 0.38 -0.18 0.22 0.78 2.0
## acousticness -0.02 -0.03 0.33 0.01 0.11 0.89 1.0
## instrumentalness -0.09 0.02 -0.04 0.57 0.33 0.67 1.1
## liveness 0.02 -0.08 0.31 0.03 0.10 0.90 1.2
## valence 0.20 0.67 -0.14 -0.08 0.51 0.49 1.3
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 1.42 1.23 0.88 0.42
## Proportion Var 0.14 0.12 0.09 0.04
## Cumulative Var 0.14 0.27 0.35 0.40
## Proportion Explained 0.36 0.31 0.22 0.11
## Cumulative Proportion 0.36 0.67 0.89 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 4 factors are sufficient.
##
## The degrees of freedom for the null model are 45 and the objective function was 1.56 with Chi Square of 450.4
## The degrees of freedom for the model are 11 and the objective function was 0.09
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.06
##
## The harmonic number of observations is 293 with the empirical chi square 21.11 with prob < 0.032
## The total number of observations was 293 with Likelihood Chi Square = 26.88 with prob < 0.0048
##
## Tucker Lewis Index of factoring reliability = 0.838
## RMSEA index = 0.072 and the 90 % confidence intervals are 0.037 0.105
## BIC = -35.61
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## Factor1 Factor2 Factor3
## Correlation of (regression) scores with factors 0.91 0.84 0.78
## Multiple R square of scores with factors 0.84 0.71 0.61
## Minimum correlation of possible factor scores 0.67 0.42 0.22
## Factor4
## Correlation of (regression) scores with factors 0.61
## Multiple R square of scores with factors 0.37
## Minimum correlation of possible factor scores -0.26
Let us visualize the contributions of each variable for measuring each common factor. The first factor is a measure of the energy and loudness of the song, the second one of the year of release of the song, and the valence, that is, how happy the song sounds. The third factor contrasts the danceability of the song to its acousticness, speachiness, and liveness. In contrast, the last factor is related to the musical characteristics of the song, such as instrumentalness and key.
tidied_fa <- fa$loadings[] %>%
as.data.frame() %>%
pivot_longer(
cols = 1:nf,
names_to = "factor",
values_to = "value"
) %>%
mutate(factor = fct_inorder(factor),
terms = rep(rownames(fa$loadings[]), each = length(unique(factor)))) %>%
select(terms, value, factor) %>%
arrange(factor)
cols.extended <- rev(colorRampPalette(c(c("#404664", "#726CC6", "#AAA7DD",
"#D3D3EE", "#e0afa0", "#FBE8DA")))(nrow(fa$loadings)))
loadings <- tidied_fa %>%
ggplot(aes(value, terms, fill = value)) +
geom_col(color = "black") +
facet_wrap(~factor) +
labs(x = "Loading value", y = NULL, title = "Loadings from the 4-factor analysis model",
caption = "source: BBC Music, TidyTuesday 2020|week 16, Spotify API") +
scale_fill_gradientn(name = "Factor loading value\n", colors = cols.extended,
guide = guide_colorbar(label = TRUE, draw.ulim = TRUE, draw.llim = TRUE,
frame.colour = "black", ticks = TRUE,
label.position = "bottom", barwidth = 7,
barheight = 1.3, direction = 'horizontal')) +
theme(plot.title = element_text(size = 22),
panel.background = element_rect(fill = "white"),
panel.grid = element_line(colour = "grey87"),
panel.border = element_rect(fill = NA, colour = mygray),
strip.background = element_rect(fill = mygray))
ggsave(here("2020", "week16", "Plots", "Loadings.pdf"), plot = loadings,
width = 12, height = 10, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Loadings.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Loadings.png"),
verbose = FALSE)Lastly, we plot the factor scores of the songs along the first two factors. The points seem to be smoothly distributed. On the left, we find quiet and/or calm songs that become progressively louder and/or more energetic as one moves to the right. The songs at the top are old and positive, whereas at the bottom we find newer and/or more negative songs.
factor_scores <- fa$scores %>%
as_tibble() %>%
magrittr::set_colnames(paste0("Factor", 1:nf))
factor_scores_f1f2 <- rankings_df %>%
select(title) %>%
bind_cols(factor_scores %>%
select(1:2))
songs_extrema <- rankings_df %>%
select(title, artist, loudness, energy, year, valence, danceability, key,
speechiness, acousticness, instrumentalness, liveness) %>%
filter(title!="Learned from Texas") %>%
slice(which.max(loudness), which.min(loudness),
which.max(energy), which.min(energy),
which.max(year), which.min(year),
which.max(valence), which.min(valence),
which.max(danceability), which.min(danceability),
which.max(speechiness), which.min(speechiness),
which.max(acousticness), which.min(acousticness),
which.max(liveness), which.min(liveness)) %>%
add_column(feature = c("Loudest", "Quietest", "Most energic", "Calmest",
"Most recent", "Oldest", "Most positive", "Most negative",
"Most danceable", "Least danceable", "Most speechy",
"Least speechy", "Most acoustic", "Least acoustic",
"Most likely live", "Least likely live"))
df_repel <- factor_scores_f1f2 %>%
left_join(songs_extrema %>%
left_join(factor_scores_f1f2, by = "title") %>%
slice(1:8),
by = c("title", "Factor1", "Factor2")) %>%
mutate(label = ifelse(!is.na(feature), feature, ""),
xmin = Factor1 - nchar(title)/(11 * 10),
xmax = Factor1 + nchar(title)/(11 * 10),
ymin = Factor2 - 0.07,
ymax = Factor2 + 0.07)
plot.factor_scores <- factor_scores_f1f2 %>%
ggplot(aes(Factor1, Factor2, label = title)) +
geom_text(check_overlap = TRUE, family = "Montaga") +
geom_label_repel(
data = df_repel[df_repel$title != "Old Town Road (Remix)",],
aes(Factor1, Factor2, label = label),
min.segment.length = 0.3,
family = "Montaga", force = 1, size = 4, point.padding = 0.3, box.padding = 0.6,
color = "#2343E7", inherit.aes = FALSE
) +
# fix label for Old Town Road ------
geom_label_repel(
data = df_repel[df_repel$title == "Old Town Road (Remix)",],
aes(Factor1, Factor2, label = label),
min.segment.length = 0.3, family = "Montaga", force = 1, size = 4,
point.padding = 0.3, box.padding = 0.6,
color = "#2343E7", inherit.aes = FALSE, xlim = c(-0.5, -0.3), ylim = c(-0.9, -0.8)
) +
coord_cartesian(ylim = c(-2.3, 2.1), xlim = c(-3.1, 1.7), clip="off") +
labs(x = "Factor 1", y = "Factor 2",
title = "How the Hip Hop songs are placed on the two-dimensional factor subspace",
subtitle = "First two common factors") +
annotate("segment", x = 0, xend = 1.3, y = -2.9, yend = -2.9, arrow = arrow(length=unit(0.3, "cm"))) +
annotate("text", x = 0.6, y = -2.8, label = "Louder and/or more energetic songs",
size = 5, family = "Montaga") +
annotate("segment", x = -3.5, xend = -3.5, y = 0.25, yend = 1.9, arrow = arrow(length=unit(0.3, "cm"))) +
annotate("text", x = -3.6, y = 1.1, label = "Older and/or more positive songs",
size = 5, angle = 90, family = "Montaga") +
annotate("text", label = "source: BBC Music, TidyTuesday 2020|week 16, Spotify API",
x = 1, y = -3.1, family = "Montaga") +
theme(plot.margin = unit(c(0.2,1.7,1.5,1.7), "cm"),
panel.background = element_rect(fill = "white"),
panel.grid = element_line(colour = "grey87"),
panel.border = element_rect(colour = "grey70", fill = NA),
plot.title = element_text(size = 22),
axis.text = element_text(size = 15),
axis.title = element_text(size = 18)) +
draw_image(image = here("2020", "week16", "Images", "headphones.png"),
x = 1.67, y = -2.76, hjust = .5, vjust = .5, width = 0.4) +
draw_image(image = here("2020", "week16", "Images", "gramophone.png"),
x = -3.5, y = 2.28, hjust = .5, vjust = .5, width = 0.4) +
draw_image(image = here("2020", "week16", "Images", "star.png"),
x = -3.7, y = 2.28, hjust = .5, vjust = .3, width = 0.2)
ggsave(here("2020", "week16", "Plots", "Factor_scores.pdf"), plot = plot.factor_scores,
width = 15, height = 10, device = cairo_pdf)
png <- pdf_convert(here("2020", "week16", "Plots", "Factor_scores.pdf"), dpi = 400,
filenames = here("2020", "week16", "Plots", "Factor_scores.png"),
verbose = FALSE)